Dimension Reduction using Auto Encoder with pytorch

Competition
Author

신호연

Published

January 12, 2023

import

import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import random
import os
import numpy as np
import torch
test_path = "./test.csv"
train_path = "./train.csv"
# %pip install plotly (jupyter notebook)
from plotly.offline import iplot
import plotly.graph_objs as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
#pio.renderers.default = 'iframe_connected'
#pio.renderers.default = "vscode"
pio.renderers.default = "plotly_mimetype+notebook"

Data

# Load the training data; "id" is dropped so it is not treated as a feature.
train = pd.read_csv(train_path).drop(columns=["id"])
train_len = len(train)  # number of training rows in the combined frame below

# Read the test file once: keep the ids aside, then drop the column from the
# frame that was already loaded instead of parsing the CSV a second time.
test = pd.read_csv(test_path)
id_test = test["id"]
test = test.drop(columns=["id"])

# Stack train and test row-wise so both receive identical preprocessing.
# NOTE: concat keeps each frame's own row labels, so labels repeat in `dataset`.
dataset = pd.concat([train, test], axis=0)
dataset = dataset.drop(columns=["father", "mother", "gender"])
dataset.head(5)
trait SNP_01 SNP_02 SNP_03 SNP_04 SNP_05 SNP_06 SNP_07 SNP_08 SNP_09 SNP_10 SNP_11 SNP_12 SNP_13 SNP_14 SNP_15 class
0 2 G G A G A A G A C A A A A A G G A A G G A G A A A A A A A A B
1 2 A G A G C A A A A A A G A A G A A A A G A A G A G G A A A A C
2 2 G G G G A A G A C C G G A A G A G A A G A A A A A A A A A A B
3 1 A A G G A A G A A A G G G G A A G G A G G G G G G G A A G G A
4 2 G G G G C C A A C C A A A A A A A A G G A A A A A G A A G A C

Preprocessing

# Binary indicator columns: 1 where the SNP column equals the given value,
# 0 otherwise. The comparison is vectorized instead of looping row by row.
# The result is assigned as a plain NumPy array so pandas does not try to
# align on row labels, which repeat after train and test were concatenated.
dataset["has01GG"] = (dataset["SNP_01"] == "G G").to_numpy().astype("int64")
dataset["has02AA"] = (dataset["SNP_02"] == "A A").to_numpy().astype("int64")
def create_col(dataset, col, value):
    """Add a 0/1 indicator column marking rows where ``dataset[col] == value``.

    The new column is named ``"has"`` + the last two characters of ``col`` +
    ``value`` with its spaces removed, e.g. ``create_col(df, "SNP_03", "A A")``
    adds a column called ``has03AA``.

    Args:
        dataset: DataFrame to extend. It is modified in place.
        col: Name of the existing column to compare.
        value: Value to compare each row of ``col`` against.

    Returns:
        The same ``dataset`` object, now containing the indicator column.
    """
    # Work on the raw boolean array so the assignment is purely positional and
    # does not depend on the frame's row labels being unique.
    matches = (dataset[col] == value).to_numpy()

    col_name = "has" + col[-2:] + value.replace(" ", "")
    dataset[col_name] = matches.astype("int64")

    return dataset

# (column, value) pairs to turn into indicator columns, listed in the order
# in which the new columns are appended to the frame.
_indicator_specs = [
    ("SNP_03", "A A"),
    ("SNP_04", "G G"),
    ("SNP_05", "C C"),
    ("SNP_06", "A A"),
    ("SNP_07", "A A"),
    ("SNP_07", "G G"),
    ("SNP_08", "G G"),
    ("SNP_09", "A A"),
    ("SNP_09", "G G"),
    ("SNP_11", "A A"),
    ("SNP_12", "A A"),
    ("SNP_12", "G G"),
    ("SNP_13", "A A"),
    ("SNP_14", "A A"),
]
for _snp_col, _snp_value in _indicator_specs:
    dataset = create_col(dataset, _snp_col, _snp_value)
# One-hot encode every column except the target, for distance-based algorithms.
# drop_first=True drops one level per column to prevent multicollinearity.
feature_cols = dataset.columns.drop("class")
dataset_ohe = pd.get_dummies(dataset, columns=feature_cols, drop_first=True)

# The first train_len rows are the training rows; the remainder is the test
# set, whose "class" column is dropped.
train_ohe = dataset_ohe.iloc[:train_len].copy()
test_ohe = dataset_ohe.iloc[train_len:].copy().drop(columns="class")

# Encode the class letters as integer labels.
class_map = {"A":0,"B":1,"C":2}
train_ohe["class"] = train_ohe["class"].map(class_map).astype(int)

X_train_ohe = train_ohe.drop(columns="class")
# Named with the "ohe" suffix for now, only for consistency with X.
Y_train_ohe = train_ohe["class"]
Y_train_ohe[:5]
0    1
1    2
2    1
3    0
4    2
Name: class, dtype: int32
# Convert the feature frame to a float tensor and the labels to a long tensor.
X_train_ohe = torch.from_numpy(X_train_ohe.to_numpy()).float()
# Alternative kept for reference (one-hot float targets instead of class indices):
#y_train_ohe = torch.from_numpy(pd.get_dummies(Y_train_ohe).values).float()
Y_train_ohe = torch.from_numpy(Y_train_ohe.to_numpy()).long()

Dimension Reduction using autoencoder with pytorch

class Encoder(nn.Module):
    """Encoder half of the autoencoder: one linear layer followed by ReLU.

    Args:
        in_features: Width of the input vectors.
        encoding_features: Width of the encoding produced by ``forward``.
    """

    def __init__(self, in_features, encoding_features):
        super().__init__()
        self.in_features = in_features
        self.encoding_features = encoding_features
        self.linr = nn.Linear(in_features, encoding_features)
        self.active_func = nn.ReLU()

    def forward(self, x):
        """Return ``ReLU(linear(x))``."""
        projected = self.linr(x)
        return self.active_func(projected)
        
class Decoder(nn.Module):
    """Decoder half of the autoencoder: a single linear layer, no activation.

    Args:
        encoding_features: Width of the encoding received by ``forward``.
        out_features: Width of the reconstructed output.
    """

    def __init__(self, encoding_features, out_features):
        super().__init__()
        self.encoding_features = encoding_features
        self.out_features = out_features
        self.linr = nn.Linear(encoding_features, out_features)

    def forward(self, x):
        """Return the linear projection of ``x``."""
        return self.linr(x)

class AutoEncoder(nn.Module):
    """Encoder followed by a decoder that maps back to the input width.

    Args:
        in_features: Width of the input; the reconstruction has the same width.
        encoding_features: Width of the intermediate encoding.
    """

    def __init__(self, in_features, encoding_features):
        super().__init__()
        self.encoder = Encoder(in_features, encoding_features)
        # The decoder's output width equals the input width.
        self.decoder = Decoder(encoding_features, in_features)

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))

encoding dimension=3

training autoencoder

# Fixed seed, set before the model is constructed.
torch.manual_seed(201711375)
# Take the input width from the data instead of hard-coding 47, so the model
# keeps matching X_train_ohe if the preprocessing changes the column count.
autoencoder_3 = AutoEncoder(X_train_ohe.shape[1], 3)
loss_fn = torch.nn.MSELoss()
relu = torch.nn.LeakyReLU()  # NOTE(review): not used in the visible code; kept in case other cells reference it
optimizer = torch.optim.Adam(autoencoder_3.parameters(), lr=0.001)
for epoch in range(20000):
    # 1. Reconstruct the whole training matrix in one forward pass.
    out = autoencoder_3(X_train_ohe)
    # 2. Reconstruction error against the input itself.
    loss = loss_fn(out, X_train_ohe)
    # 3. Backpropagate.
    loss.backward()
    if epoch % 10000 == 0:
        print(f"epoch:{epoch} loss:{loss.tolist()}")
    # 4. Update the parameters, then clear gradients for the next iteration.
    optimizer.step()
    optimizer.zero_grad()
epoch:0 loss:0.5274774432182312
epoch:10000 loss:0.11526338756084442

visualization

# Reverse lookup: integer label -> original class letter.
class_map_inv = {label: letter for letter, label in class_map.items()}
class_map_inv
{0: 'A', 1: 'B', 2: 'C'}
# Put the encoder output for every training row next to its class label and
# name the three encoding columns x, y and z.
encoded = np.array(autoencoder_3.encoder(X_train_ohe).tolist())
dt_dim3 = pd.concat(
    [pd.DataFrame(encoded), pd.DataFrame({"class": Y_train_ohe})],
    axis=1,
).rename(columns={0: "x", 1: "y", 2: "z"})

# One scatter trace per class. Colours follow the order in which the classes
# first appear: red, then blue, then black for any further class.
palette = ("red", "blue")
data = []
for position, cl in enumerate(dt_dim3["class"].unique()):
    points = dt_dim3.loc[dt_dim3["class"] == cl, :]
    color = palette[position] if position < len(palette) else "black"
    data.append(
        go.Scatter3d(
            x=points.x.tolist(),
            y=points.y.tolist(),
            z=points.z.tolist(),
            mode="markers",
            marker=dict(color=color, size=2),
            name=str(class_map_inv[cl]),
        )
    )

layout = go.Layout(title=dict(text = "3-dimension "))

# Build the figure and display it.
fig = go.Figure(data=data, layout=layout)
fig.show()

encoding dimension=10

# Fixed seed, set before the model is constructed.
torch.manual_seed(201711375)
# This model has a 10-dimensional encoding, so it gets its own name instead of
# rebinding `autoencoder_3` and discarding the trained 3-dimensional model.
# The input width comes from the data rather than a hard-coded 47.
autoencoder_10 = AutoEncoder(X_train_ohe.shape[1], 10)
loss_fn = torch.nn.MSELoss()
relu = torch.nn.LeakyReLU()  # NOTE(review): not used in the visible code; kept in case other cells reference it
optimizer = torch.optim.Adam(autoencoder_10.parameters(), lr=0.001)
for epoch in range(40000):
    # 1. Reconstruct the whole training matrix in one forward pass.
    out = autoencoder_10(X_train_ohe)
    # 2. Reconstruction error against the input itself.
    loss = loss_fn(out, X_train_ohe)
    # 3. Backpropagate.
    loss.backward()
    if epoch % 10000 == 0:
        print(f"epoch:{epoch} loss:{loss.tolist()}")
    # 4. Update the parameters, then clear gradients for the next iteration.
    optimizer.step()
    optimizer.zero_grad()
epoch:0 loss:0.3828417658805847
epoch:10000 loss:0.060432590544223785
epoch:20000 loss:0.06043253839015961
epoch:30000 loss:0.060432031750679016

참고링크

링크1
링크2